In [11]:
import torch

# Loading the model and moving it to the GPU if available
if torch.cuda.is_available():  # for nvidia GPUs etc.
    device = torch.device('cuda')
elif torch.backends.mps.is_available(): # for Apple Metal Performance Sharder (mps) GPUs
    device = torch.device('mps')
else:
    device = torch.device('cpu')

print(device)

# Load the fine-tuned model (after training)

from transformers import AutoModelForMaskedLM, AutoConfig, DistilBertTokenizerFast

dir_model = "./model-elsevier (lr 5e-5)"
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
config = AutoConfig.from_pretrained(dir_model, output_hidden_states=True)
model_elsevier = AutoModelForMaskedLM.from_pretrained(dir_model, config=config).to(device)
mps
In [142]:
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()


def get_embeddings(vocab, model, batch_size=100):
    tokenized_words = tokenizer(vocab, return_tensors="pt",
                                padding=True, truncation=True).to(device)

    # Split tokenized words into batches
    token_batches = [{'input_ids': tokenized_words["input_ids"][i:i + batch_size],
                      'attention_mask': tokenized_words["attention_mask"][i:i + 100]} for i in range(0, len(tokenized_words["input_ids"]), batch_size)]
    
    # Initialize an empty list to store the embeddings
    all_word_embeddings = []

    # Process each batch
    for batch_tokens in tqdm(token_batches):

        # Forward pass to get embeddings for the current batch
        with torch.no_grad():
            outputs = model(input_ids=batch_tokens["input_ids"],
                            attention_mask=batch_tokens["attention_mask"])

        # Obtain embeddings for the current batch
        batch_embeddings = outputs.hidden_states[0].cpu().numpy()

        # Append to the list
        all_word_embeddings.append(batch_embeddings)

    # Concatenate the embeddings from all batches
    all_word_embeddings_concat = np.concatenate(all_word_embeddings, axis=0)

    # Take the mean of embeddings for each word
    all_word_embeddings_concat = np.mean(all_word_embeddings_concat, axis=1)
    
    return scaler.fit_transform(all_word_embeddings_concat)

# Get the vocabulary
unique_tokens_elsevier = list(tokenizer.get_vocab())


all_word_embeddings_elsevier = get_embeddings(unique_tokens_elsevier,
                                              model_elsevier)
print(all_word_embeddings_elsevier.shape)
100%|█████████████████████████████████████████| 306/306 [00:21<00:00, 13.93it/s]
(30522, 768)
In [143]:
# Load reddit model
dir_model = "./model-redit(lr 5e-5)"
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
config = AutoConfig.from_pretrained(dir_model, output_hidden_states=True)
model_reddit = AutoModelForMaskedLM.from_pretrained(dir_model, config=config).to(device)

# Get the vocabulary
unique_tokens_reddit = list(tokenizer.get_vocab())

all_word_embeddings_reddit = get_embeddings(unique_tokens_reddit,
                                            model_reddit)
print(all_word_embeddings_reddit.shape)
100%|█████████████████████████████████████████| 306/306 [00:21<00:00, 14.42it/s]
(30522, 768)
In [6]:
# Read the file and extract words and vectors
with open('vectors.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Get glove vector from file txt
glove_vectors = {}

for line in tqdm(lines):
    parts = line.split()
    word = parts[0]
    vector = np.array([float(x) for x in parts[1:]])
    glove_vectors[word] = vector

# Extract vectors
unique_tokens_glove = list(glove_vectors.keys())

# Convert the list of vectors to a NumPy array and normalize
all_word_embeddings_glove = scaler.fit_transform(np.array(list(glove_vectors.values())))

print(all_word_embeddings_glove.shape)
100%|████████████████████████████████| 100001/100001 [00:07<00:00, 13382.02it/s]
(100001, 768)
In [14]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Perform t-SNE dimensionality reduction
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d_elsevier = tsne.fit_transform(all_word_embeddings_elsevier)
In [15]:
# Plot the 2D embeddings
plt.figure(figsize=(10, 8))
plt.scatter(embeddings_2d_elsevier[:, 0], embeddings_2d_elsevier[:, 1], s=5)
plt.title('t-SNE Visualization of Word Embeddings')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()
In [13]:
# Perform same 2d plot to compare
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d_reddit = tsne.fit_transform(all_word_embeddings_reddit)
In [14]:
# Plot the 2D embeddings
plt.figure(figsize=(10, 8))
plt.scatter(embeddings_2d_reddit[:, 0], embeddings_2d_reddit[:, 1], s=5)
plt.title('t-SNE Visualization of Word Embeddings')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()
In [7]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Perform same 2d plot to compare
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d_glove = tsne.fit_transform(all_word_embeddings_glove)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
In [19]:
# Plot the 2D embeddings
plt.figure(figsize=(10, 8))
plt.scatter(all_word_embeddings_glove[:, 0], all_word_embeddings_glove[:, 1], s=5)
plt.title('t-SNE Visualization of Word Embeddings')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()

Lets load a dataset with categorioes in order to get more information from plots and later evaluate the different embeddings we obtained.

In [16]:
import pandas as pd


categories_df = pd.read_csv('https://raw.githubusercontent.com/vecto-ai/word-benchmarks/master/word-categorization/monolingual/en/ap.csv',
                            index_col=0)

categories = {}
for category in set(categories_df["category"]):
    categories[category] = categories_df.loc[categories_df["category"] == category, "word"].dropna().values

categories["animal"]
Out[16]:
array(['bear', 'bull', 'camel', 'cat', 'cow', 'deer', 'dog', 'elephant',
       'horse', 'kitten', 'lion', 'monkey', 'mouse', 'oyster', 'puppy',
       'rat', 'sheep', 'tiger', 'turtle', 'zebra'], dtype=object)
In [17]:
# Plot the embeddings with a category in red

category = "feeling"

plt.figure(figsize=(10, 8))
scatter = plt.scatter(embeddings_2d_elsevier[:, 0], embeddings_2d_elsevier[:, 1], s=5, c='lightblue')  # s is the marker size

for word, (x, y) in zip(unique_tokens_elsevier, embeddings_2d_elsevier):
    if word in categories[category]:
        index = unique_tokens_elsevier.index(word)
        scatter = plt.scatter(x, y, s=5, c='red', label=word)
    
plt.title('t-SNE Visualization of Word Embeddings')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()
In [20]:
# Perform t-SNE with 3 dimensions
tsne = TSNE(n_components=3, random_state=42)
embeddings_3d_elsevier = tsne.fit_transform(all_word_embeddings_elsevier)
In [21]:
import plotly.express as px

fig = px.scatter_3d(x=embeddings_3d_elsevier[:, 0],
                    y=embeddings_3d_elsevier[:, 1],
                    z=embeddings_3d_elsevier[:, 2])
fig.update_traces(marker_size = 2)
fig.show()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

As you can see in the previous plot, there is a huge accumulation and some outliers. The outliers are making it difficult to visualize the accumulation, so lets remove them.

In [22]:
import numpy as np
from scipy.stats import zscore

# Calculate z-scores for each dimension
z_scores = zscore(embeddings_3d_elsevier, axis=0)

# Set a threshold for z-scores to identify outliers (adjust as needed)
threshold = 2.5

# Identify indices of non-outliers
non_outlier_indices = np.all(np.abs(z_scores) < threshold, axis=1)

# Filter data to exclude outliers
filtered_embeddings = embeddings_3d_elsevier[non_outlier_indices]

fig = px.scatter_3d(x=filtered_embeddings[:, 0],
                    y=filtered_embeddings[:, 1],
                    z=filtered_embeddings[:, 2])

fig.update_traces(marker_size = 2)
fig.show()
In [23]:
# Plot in red a category

import plotly.graph_objects as go

category = "feeling"

fig = px.scatter_3d(x=filtered_embeddings[:, 0],
                    y=filtered_embeddings[:, 1],
                    z=filtered_embeddings[:, 2],
                    opacity=0.5)

# Change the color of specific words
for word, (x, y, z) in zip(unique_tokens_elsevier, embeddings_3d_elsevier):
    if word in categories[category]:
        fig.add_trace(
            go.Scatter3d(x=[x],
                         y=[y],
                         z=[z],
                         mode="markers",
                         marker=dict(
                             size=12,
                             color="red")))
        
fig.update_traces(marker_size = 2)
fig.update_layout(showlegend=False)
fig.show()

In this 3d representation, the categories are more grouped together.

In order to evaluate the embeddings, lets create a function to get the first n words given an analogy. For example, if woman-men, queen-?, which n words will be fit the analogy. In other words, if we calculate the distance between woman and meen in the vector representation and we move this distance from queen, we should get king.

In [45]:
def get_embeddings_word(word, vocab, embeddings, method="transformer"):
    if method == "glove":
        return glove_vectors[word]
    
    return embeddings[vocab.index(word)]
In [46]:
from sklearn.metrics.pairwise import cosine_similarity

def get_analogy(w1, w2, w3, vocab, embeddings, n, method="transformer"):
    embedding_w1 = get_embeddings_word(w1, vocab, embeddings, method=method)
    
    embedding_w2 = get_embeddings_word(w2, vocab, embeddings, method=method)
    
    embedding_w3 = get_embeddings_word(w3, vocab, embeddings, method=method)

    analogy_vector = embedding_w1 - embedding_w2

    embedding_result = embedding_w3 + analogy_vector

    # Compute cosine similarity between embedding_king_prime and all other word embeddings
    similarities = cosine_similarity([embedding_result], embeddings)
    
    # Get the indices of the top X most similar words
    top_indices = similarities.argsort()[0][-1-n:][::-1]

    # Retrieve the top X words
    return [vocab[i] for i in top_indices if vocab[i] != w3]

get_analogy("woman", "man", "king", unique_tokens_elsevier, all_word_embeddings_elsevier, 3)
Out[46]:
['queen', 'woman', 'kings']
In [47]:
get_analogy("woman", "man", "king",
            unique_tokens_glove, all_word_embeddings_glove, n=5, method="glove")
Out[47]:
['woman', 'floating_adj', 'earmarked', 'dedham', 'swollen_verb']

Lets load a analogies dataset and evaluate our embeddings

In [48]:
import pandas as pd
analogy_df = pd.read_csv('https://raw.githubusercontent.com/vecto-ai/word-benchmarks/master/word-analogy/monolingual/en/google-analogies.csv',
                        index_col=0)

def eval_analogies(vocab, embeddings, x=len(analogy_df), method="glove"):
    punct = 0
        
    for i in tqdm(range(len(analogy_df[:x]))):
        if all([word.lower() in vocab for word in analogy_df.loc[i, ["word1", "word2", "word3", "target"]].values]):
            target = analogy_df.loc[i, "target"].lower()
        
            similar_words = get_analogy(analogy_df.loc[i, "word1"].lower(),
                                        analogy_df.loc[i, "word2"].lower(),
                                        analogy_df.loc[i, "word3"].lower(),
                                        vocab,
                                        embeddings, 5)
            similar_words.reverse()

            if target in similar_words:
                punct = punct + similar_words.index(target) + 1
            
    return punct
In [49]:
analogies_elsevier = eval_analogies(unique_tokens_elsevier, all_word_embeddings_elsevier)
analogies_elsevier
100%|█████████████████████████████████████| 19544/19544 [08:08<00:00, 39.98it/s]
Out[49]:
19267
In [50]:
analogies_reddit = eval_analogies(unique_tokens_reddit, all_word_embeddings_reddit)
analogies_reddit
100%|█████████████████████████████████████| 19544/19544 [08:23<00:00, 38.83it/s]
Out[50]:
16258
In [31]:
analogies_glove = eval_analogies(unique_tokens_glove, all_word_embeddings_glove,
                                 x=1000, method="glove")
analogies_glove
100%|███████████████████████████████████████| 1000/1000 [02:42<00:00,  6.16it/s]
Out[31]:
0
In [78]:
from sklearn.metrics.pairwise import cosine_similarity

def predict_category(word, vocab, embeddings, threshold=0.7):
    
    # Compute category centroids
    category_centroids = {
        category: get_category_centroids(words, vocab, embeddings)
        for category, words in categories.items()
    }
    
    max_similarity = 0
    
    if word in vocab:
        word_embedding = get_embeddings_word(word, vocab, embeddings)
        
        max_category = {}
        for category in category_centroids:
            similarities = cosine_similarity([word_embedding], category_centroids[category])
            max_category[category] = similarities.max()

        max_similarity = max(max_category.values())
    
    if max_similarity > threshold:
        predicted_category = max(max_category, key=max_category.get)
        return predicted_category
    else:
        return "Uncategorized"

def get_category_centroids(words, vocab, embeddings):
    category_embedding = []
    for word in words:
        if word in vocab:
            category_embedding.append(get_embeddings_word(word, vocab, embeddings))
    return np.vstack(category_embedding)
            


# Usage:
word_to_predict = "elephant"
predicted_category = predict_category(word_to_predict, unique_tokens_elsevier, all_word_embeddings_elsevier)
print(f"The predicted category for '{word_to_predict}' is '{predicted_category}'.")
The predicted category for 'elephant' is 'animal'.
In [39]:
def eval_categories(vocab, embeddings):
    punct = 0
    
    for i in tqdm(range(len(categories_df))):
        
        if categories_df.loc[i, "category"] != float("nan") or categories_df.loc[i, "word"] != float("nan"):
            observed_category = categories_df.loc[i, "category"].lower()


            predicted_category = predict_category(categories_df.loc[i, "word"],
                                                  vocab,
                                                  embeddings)

            if observed_category == predicted_category:
                punct = punct + 1
    return punct
In [40]:
categories_elsevier = eval_categories(unique_tokens_elsevier, all_word_embeddings_elsevier)
categories_elsevier
100%|█████████████████████████████████████████| 423/423 [00:38<00:00, 11.10it/s]
Out[40]:
285
In [41]:
categories_reddit = eval_categories(unique_tokens_reddit, all_word_embeddings_reddit)
categories_reddit
100%|█████████████████████████████████████████| 423/423 [00:38<00:00, 10.95it/s]
Out[41]:
285
In [144]:
def get_embeddings_concept(words, vocab, all_embeddings):
    embeddings_words = np.empty((0, 768))
    
    for word in words:
        embeddings_word = get_embeddings_word(word, vocab, all_embeddings)
        embeddings_words = np.vstack([embeddings_words, embeddings_word])


    # Take the mean of embeddings
    return words, np.mean(embeddings_words, axis=0)
In [145]:
embedding_elsevier_losses = get_embeddings_concept(["losses", "loss", "lost", "losing", "lose", "loses"],
                                                unique_tokens_elsevier, all_word_embeddings_elsevier)
embedding_reddit_losses = get_embeddings_concept(["losses", "loss", "lost", "losing", "lose", "loses"], 
                                                 unique_tokens_reddit, all_word_embeddings_reddit)

vector_elsevier = embedding_elsevier_losses[1].reshape(1, -1)
vector_reddit = embedding_reddit_losses[1].reshape(1, -1)

cosine_similarity_score = cosine_similarity(vector_reddit, vector_elsevier)

print("Cosine Similarity:", cosine_similarity_score[0][0])


from numpy.linalg import norm

A=embedding_elsevier_losses[1]
B=embedding_reddit_losses[1]

cosine = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine)


dot_p = np.dot(A,B)
print("Dot product :", dot_p)
Cosine Similarity: 0.9639859542835594
Cosine Similarity: 0.9639859542835584
Dot product : 197.98296380900612
In [146]:
import string

def get_similar_words(aggregate_embeddings, vocab, all_embeddings, n):
    words = aggregate_embeddings[0]
    
    # Compute cosine similarity between embedding_king_prime and all other word embeddings
    similarities = cosine_similarity([aggregate_embeddings[1]], all_embeddings)
    
    # Get the indices of the top X most similar words
    indices = similarities.argsort()[0][:][::-1]
    
    similar_words = []
    i = 0
    
    while len(similar_words) != n:
        similar_word = vocab[indices[i]]
        if any(char in string.ascii_lowercase for char in similar_word) and similar_word not in words:
            similar_words.append(similar_word)
        i = i + 1
    
     
    return similar_words

get_similar_words(embedding_elsevier_losses,
                  unique_tokens_elsevier,
                  all_word_embeddings_elsevier, 10)
Out[146]:
['regained',
 'regaining',
 'casualties',
 'relinquished',
 'lineman',
 'ceded',
 'regain',
 'scowled',
 'injuring',
 'victories']
In [147]:
get_similar_words(embedding_reddit_losses,
                  unique_tokens_reddit, all_word_embeddings_reddit, 10)
Out[147]:
['regained',
 'regain',
 'regaining',
 'ceded',
 'retain',
 'gained',
 'gaining',
 'relinquished',
 'destroys',
 'retains']
In [148]:
embedding_glove_losses = get_embeddings_concept(["losses", "loss", "lost", "losing", "lose", "loses"], 
                                                 unique_tokens_glove, all_word_embeddings_glove)

get_similar_words(embedding_glove_losses,
                  unique_tokens_glove, all_word_embeddings_glove, 10)
Out[148]:
['<unk>',
 'limping',
 'teddy_noun',
 'lobed_verb',
 'superannuation_noun',
 'beekman',
 'meddle',
 'participants_noun',
 'balked_verb',
 'kk_noun']
In [215]:
related_words = ["pay", "payment", "tax", "taxes", "gamble",
                 "monetary", "money", "purchase", "trade",
                 "trading", "gain",  "grief", "negative", "sentimental"]

def get_distance_from_concept(words, concept_embeddings, vocab, embeddings):
    
    distance = {}
    for word in words:
        embeddings_word = get_embeddings_word(word, vocab, embeddings)
        cosine_similarity_score = cosine_similarity(embeddings_word.reshape(1, -1),
                                                    concept_embeddings.reshape(1, -1))
        
        dot_p = np.dot(embeddings_word, concept_embeddings)
        
        distance[word] = (cosine_similarity_score[0][0], dot_p)
    return distance
        
distance_concept_elsevier = get_distance_from_concept(related_words, embedding_elsevier_losses[1],
                          unique_tokens_elsevier, all_word_embeddings_elsevier)
distance_concept_elsevier
Out[215]:
{'pay': (0.976685450134017, 202.48150320327534),
 'payment': (0.9795798593239768, 201.03050568760682),
 'tax': (0.9769047155841063, 202.31815508921716),
 'taxes': (0.9801746978088899, 200.84567882997297),
 'gamble': (0.9814993664983425, 201.71005262240448),
 'monetary': (0.9791837669040321, 200.97352997847997),
 'money': (0.9766291317097666, 202.51230657035964),
 'purchase': (0.9788920054119846, 201.54759388989274),
 'trade': (0.9777536996312579, 202.98090830094583),
 'trading': (0.9779334863416884, 201.5486211991196),
 'gain': (0.9834219178256203, 203.47531728225005),
 'grief': (0.9854000723591345, 201.52914524830607),
 'negative': (0.9792852335941129, 202.35542841652358),
 'sentimental': (0.9867355109991538, 202.91185808372697)}
In [216]:
distance_concept_reddit = get_distance_from_concept(related_words, embedding_reddit_losses[1],
                                                    unique_tokens_reddit, all_word_embeddings_reddit)
distance_concept_reddit
Out[216]:
{'pay': (0.9783330549429938, 202.84510952451416),
 'payment': (0.9792008952505205, 202.11917536586157),
 'tax': (0.9783460494492402, 202.89668067797345),
 'taxes': (0.9804420284383639, 201.24236903036586),
 'gamble': (0.9822085261322759, 202.98225621427594),
 'monetary': (0.9805221310226453, 201.69854234479106),
 'money': (0.9784580044630313, 201.92124390420213),
 'purchase': (0.9800533481079252, 202.4430386701623),
 'trade': (0.979102189727739, 203.17395386707813),
 'trading': (0.9792906392765106, 202.20434713114173),
 'gain': (0.984792278270893, 203.58430323622918),
 'grief': (0.9826914948523917, 202.28639495461636),
 'negative': (0.9808707301004688, 202.54485887176503),
 'sentimental': (0.9822289429984885, 202.32761765047832)}
In [217]:
keys = list(distance_concept_reddit.keys())
cosine_dist_reddit = [value[0] for value in list(distance_concept_reddit.values())]
dot_product_reddit = [value[1] for value in list(distance_concept_reddit.values())]

cosine_dist_elsevier = [value[0] for value in list(distance_concept_elsevier.values())]
dot_product_elsevier = [value[1] for value in list(distance_concept_elsevier.values())]
In [218]:
plt.figure(figsize=(10,6))
plt.barh(np.arange(len(keys))-0.2, cosine_dist_reddit, height= 0.4, label="Reddit")
plt.barh(np.arange(len(keys))+0.2, cosine_dist_elsevier, height= 0.4, label="Elsevier")
plt.yticks(np.arange(len(keys)), keys)
plt.xlim((0.975, 0.99))
plt.legend()
plt.show()
In [219]:
plt.figure(figsize=(10,6))
plt.barh(np.arange(len(keys))-0.2, dot_product_reddit, height= 0.4, label="Reddit")
plt.barh(np.arange(len(keys))+0.2, dot_product_elsevier, height= 0.4, label="Elsevier")
plt.yticks(np.arange(len(keys)), keys)
plt.xlim((200, 205))
plt.legend()
plt.show()
In [ ]: